-------------------------------------------------------------------------------
      name:  <unnamed>
       log:  /afs/econ.duke.edu/data/vjh3/WageReturns/Data/y97/RawData/AFQT_MAT
> CHING_PC/AFQT_MATCHING_PC_with_weights.log
  log type:  text
 opened on:  14 Jan 2014, 13:38:25

. 
. /***************************************************
> MATCHING AFQT SCORES ACROSS NLSY 1979 and NLSY 1997
> 
> This do file creates a data file with comparable AFQT scores across both NLSY
> 79 and NLSY97.
> There are two main steps in creating the comparable AFQT scores:
> 
> 1. The 1979 ASVAB is a Paper and Pencil (P&P) test, while the 1997 ASVAB was 
> computer administered. 
> To make the scores comparable across cohorts, we rely on a percentile mapping
>  provided by Dan Segall (Segall (1997))
> 
> 2. The age at which respondents took the test differs between 1979 and 1997. 
> The 1997 sample is much younger.
> For both samples, we observe a large sample of individuals taking the test at
>  age 16. We use this overlap in the 
> test-taking age by mapping all test scores within cohorts into the age 16-dis
> tribution based on the within age 
> ranking of test scores. 
> 
> For details, see Segall (1997) and Altonji, Bharadwaj & Lange (2009).
> 
> Altonji, J., Bharadwaj, P. & Lange, F. "Changes in the Characteristics of Ame
> rican Youth - 
> Implications for Adult Outcomes" NBER Working Papers No. 13883, revised 2009.
> Segall, D. O. (1997). "Equating the CAT-ASVAB". In W. A. Sands, B. K. Waters,
>  & J. R. McBride (Eds.), 
>         Computerized adaptive testing: From inquiry to operation (pp. 181-198
> ). Washington, DC: American Psychological Association. 
> Date: August 19, 2009.
> ******************************************************/
. 
. // Set path to directory containing afqt1997a.csv //
. capture cd "/afs/econ.duke.edu/data/vjh3/WageReturns/Data/y97/RawData/AFQT_MA
> TCHING_PC"

. * cd "/Users/JKukkur/Documents/Research/ABL/AFQT MATCHING"
. 
. tempfile afqt97 afqt_append nlsy_agestd agestd_afqt missings finished_product

. 
. /****************************************************************
> First Step of score conversion: 
> Transform CAT Test Scores from NLSY97 into Paper and Pencil Test Scores using
>  mapping provided by Dan Segall.
> Combine this data with raw data from NLSY79.
> *****************************************************************/
. 
. 
. // afqt1997a.csv contains individual id's and sex from NLSY1997 and the ASVAB
>  component scores provided by Dan Segall. 
. // These are P&P equivalent scores based on the mapping procedure described 
> in Segall (1996). 
. // Dan Segall supplied us with these P&P equivalent scores using ASVAB compon
> ent scores contained in (DICTIONARY FILE).
. insheet using afqt1997a.csv, comma
(36 vars, 8984 obs)

. ren v1 pid

. ren v2 male

. gen asvabPC=pc if pc!=0 // pc=paragraph comprehension, no=numerical operation
> s 
(1881 missing values generated)

. keep pid male asvabPC

. sort pid

. save `afqt97', replace
(note: file /tmp/90665.1.all.q/St10448.000004 not found)
file /tmp/90665.1.all.q/St10448.000004 saved

. 
. // Merge age in for the NLSY97 sample //
. infile using age97.dct, clear

infile dictionary {
  R0000100 "PUBID - YTH ID CODE 1997"
  R1194100 "CV_AGE_INT_DATE 1997"
  R2553500 "CV_AGE_INT_DATE 1998"
  R3876300 "CV_AGE_INT_DATE 1999"
  R5453700 "CV_AGE_INT_DATE 2000"
  R7216000 "CV_AGE_INT_DATE 2001"
  S1531400 "CV_AGE_INT_DATE 2002"
  S2001000 "CV_AGE_INT_DATE 2003"
  S3801100 "CV_AGE_INT_DATE 2004"
}

(8984 observations read)

. ren R0000100    pid

. ren R1194100    age  // age as of 1997 (test-taking year for NLSY97) //

. keep pid age

. sort pid

. merge pid using `afqt97' 
(note: you are using old merge syntax; see [D] merge for new syntax)

. drop _merge 

. sort pid

. save `afqt97', replace
file /tmp/90665.1.all.q/St10448.000004 saved

. 
. // Merge in weights for 1997 data //
. // We use the custom weight provided by the NLSY for the year 1997, the year 
> when the ASVAB was administered. //
. insheet using weights97.csv, clear
(2 vars, 8984 obs)

. sort pid

. merge pid using `afqt97'
(note: you are using old merge syntax; see [D] merge for new syntax)
pid was int now float

. drop _merge 

. gen sample=1                            // Sample Identifier: 1= 1997 NLSY sa
> mple, 0=1979 NLSY sample //

. sort sample pid

. save `afqt97', replace
file /tmp/90665.1.all.q/St10448.000004 saved

. 
. // NLSY 1979 Sample: Age Information and AFQT-scores //
. // NLSY 1979 Sample: Age Information and AFQT-scores //
. infile using y79_asvab.dct, clear

infile dictionary {
  R0000100 "ID# (1-12686) 79"
  R0000500 "DATE OF BIRTH - YR 79"
  R0173600 "SAMPLE ID  79 INT"
  R0406510 "AGE OF R @ INT DATE 80"
  R0618010 "PROFILES ASVAB SEC 1-STD SCRNR 81"
  R0618011 "PROFILES ASVAB SEC 2-STD SCRNR 81"
  R0618012 "PROFILES ASVAB SEC 3-STD SCRNR 81"
  R0618013 "PROFILES ASVAB SEC 4-STD SCRNR 81"
  R0618014 "PROFILES ASVAB SEC 5-STD SCRNR 81"
  R0618015 "PROFILES ASVAB SEC 6-STD SCRNR 81"
  R0618016 "PROFILES ASVAB SEC 7-STD SCRNR 81"
  R0618017 "PROFILES ASVAB SEC 8-STD SCRNR 81"
  R0618018 "PROFILES ASVAB SEC 9-STD SCRNR 81"
  R0618019 "PROFILES ASVAB SEC 10-STD SCRNR 81"
  R0618100 "PROFILES ASVAB VRBL-STD SCRNR 81"
  R0618200 "PROFILES AFQT PRCTILE SCORE-80 81"
  R0618300 "PROFILES AFQT PRCTILE 89 (REV) 81"
  R0618301 "PROFILES AFQT PRCTILE 2006 (REV) 81"
}

(12686 observations read)

. ren R0000100 pid                

. ren R0000500 birthyear 

. ren R0173600 sampid

. ren R0406510 age                                                             
>            // Age as of 1980 (test taking year for NLSY79) //

. 
. ren R0618010 gs

. ren R0618011 ar

. ren R0618012 wk

. ren R0618013 pc

. ren R0618014 no

. ren R0618015 cs

. ren R0618016 as

. ren R0618017 mk

. ren R0618018 mc

. ren R0618019 ei

. 
. ren R0618100 asvabVerb

. ren R0618200 AFQT_1

. ren R0618300 AFQT_2

. ren R0618301 AFQT_3

. 
. foreach var in gs ar wk pc no cs as mk mc ei {
  2.         qui replace `var'=. if `var'<0
  3. }

. gen asvabPC = pc if pc!=0
(808 missing values generated)

. drop if asvabPC==.
(808 observations deleted)

. label var asvabPC "Arithmetic Reasoning subset of ASVAB score"

. replace age=80-birthyear if age<0 & birthyear!=.        // Fill missing age u
> sing birth-year // 
(202 real changes made)

. drop birthyear

. gen sample=0                                                    // Sample Ide
> ntifier: 1= 1997 NLSY sample, 0=1979 NLSY sample //

. append using `afqt97'                           // Append the data-set for NL
> SY97 //

. sort sample pid

. save `afqt_append', replace
(note: file /tmp/90665.1.all.q/St10448.000005 not found)
file /tmp/90665.1.all.q/St10448.000005 saved

. 
. * Merge in 1979 weights
. // We use the custom weight provided by the NLSY for the year 1979, the year 
> when the ASVAB was administered. //
. 
. insheet using weights79.csv, clear
(2 vars, 12686 obs)

. gen sample=0

. sort sample pid

. merge sample pid using `afqt_append'
(note: you are using old merge syntax; see [D] merge for new syntax)
pid was int now float

. drop _merge

. 
. * The weights have implied 2 decimal places.
. replace weight=weight/100
weight was long now double
(21670 real changes made)

. sort sample pid

. save `afqt_append', replace
file /tmp/90665.1.all.q/St10448.000005 saved

. 
. /***************************************************************************
> Second Step: Percentile mapping of P&P test scores into age=16 distribution
> ****************************************************************************/
. 
. *****           GENERATING PERCENTILES OF SCORES BY AGE AND NLSY-SAMPLE *****
> ****
. use `afqt_append', clear

. drop gs ar wk pc no cs as mk mc ei asvabVerb AFQT_1 AFQT_2 AFQT_3

. 
. // Drop those with missing AFQT PC scores
. drop if asvabPC==.
(2689 observations deleted)

. 
. // For Table I in Introduction.doc
. bysort sample: tab age

-------------------------------------------------------------------------------
-> sample = 0

 AGE OF R @ |
INT DATE 80 |      Freq.     Percent        Cum.
------------+-----------------------------------
         15 |        962        8.10        8.10
         16 |      1,511       12.72       20.82
         17 |      1,488       12.53       33.35
         18 |      1,432       12.06       45.40
         19 |      1,502       12.65       58.05
         20 |      1,558       13.12       71.17
         21 |      1,539       12.96       84.12
         22 |      1,529       12.87       96.99
         23 |        357        3.01      100.00
------------+-----------------------------------
      Total |     11,878      100.00

-------------------------------------------------------------------------------
-> sample = 1

 AGE OF R @ |
INT DATE 80 |      Freq.     Percent        Cum.
------------+-----------------------------------
         12 |        964       13.57       13.57
         13 |      1,409       19.84       33.41
         14 |      1,480       20.84       54.24
         15 |      1,492       21.01       75.25
         16 |      1,325       18.65       93.90
         17 |        430        6.05       99.96
         18 |          3        0.04      100.00
------------+-----------------------------------
      Total |      7,103      100.00


. 
. 
. // For Figure 1 in NLSY79
. kdensity asvabPC if sample==0&age==16, addplot(kdensity asvabPC if sample==1&
> age==16, lpattern(_)) title(Figure 1: AFQT Scores at Age 16) ///
> note("The NLSY79-scores are the P&P scores reported by the NLSY79." "The NLSY
> -97 scores are based on the CAT scores from NLSY97 and the equation by Segal 
> (1997)." "Both populations are weighted to be population representative.") //
> /
> legend(label(1 "NLSY 1979") label(2 "NLSY 1997") cols(2)) saving(HistScores, 
> replace)
(file HistScores.gph saved)

. 
. 
. // Combine sparsely populated age-groups in both samples with adjacent age-gr
> oups //
. replace age=22 if age==23 & sample==0
(357 real changes made)

. replace age=17 if age==18 & sample==1
(3 real changes made)

. 
. *******************************************************************
. *EXPANDING THE DATA AND CREATING PERCENTILES BY HAND
. // The following procedure improves the quality of the percentile mapping. 
. // The problem is that some observations 'belong' in several percentiles beca
> use they 
. // have large weights. Stata commands such as xtile will simply assign a uniq
> ue percentile to these
. // observations. Instead, we need to account for the fact that these observat
> ions belong to several pctiles. 
. // This is achieved by expanding the data-set proportionally to the weights a
> nd then generating percentiles. 
. 
. *expanding each observation by its weight - an observation with a weight of 1
> 100 is expanded into 11 observations. 
. gen percentile_rank=.
(18981 missing values generated)

. replace weight=round(weight/100)
(18981 real changes made)

. foreach num of numlist 0/160 {
  2.         qui expand `num' if weight==`num'
  3. }

. 
. *generating a unique rank within each sample and age
. bysort sample age: egen r=rank(asvabPC), u

. gen pasvabPC=.
(471020 missing values generated)

. 
. * Divide the rank by number of individuals corresponding to the population of
>  a given age and sample
. * to get the percentile of an individual.
. 
. gen holdey = 1/100

. bysort sample age: egen sumWgts=total(holdey)

. 
. *SAMPLE==0
. qui replace pasvabPC=round(r/sumWgts) if age==15 & sample==0

. qui replace pasvabPC=round(r/sumWgts) if age==16 & sample==0

. qui replace pasvabPC=round(r/sumWgts) if age==17 & sample==0

. qui replace pasvabPC=round(r/sumWgts) if age==18 & sample==0

. qui replace pasvabPC=round(r/sumWgts) if age==19 & sample==0

. qui replace pasvabPC=round(r/sumWgts) if age==20 & sample==0

. qui replace pasvabPC=round(r/sumWgts) if age==21 & sample==0

. qui replace pasvabPC=round(r/sumWgts) if age==22 & sample==0

. 
. *SAMPLE 1
. qui replace pasvabPC=round(r/sumWgts) if age==12 & sample==1

. qui replace pasvabPC=round(r/sumWgts) if age==13 & sample==1

. qui replace pasvabPC=round(r/sumWgts) if age==14 & sample==1

. qui replace pasvabPC=round(r/sumWgts) if age==15 & sample==1

. qui replace pasvabPC=round(r/sumWgts) if age==16 & sample==1

. qui replace pasvabPC=round(r/sumWgts) if age==17 & sample==1

. 
. *dropping the duplicates now
. egen tag=tag(sample pid)

. keep if tag==1
(452039 observations deleted)

. drop tag

. 
. sort sample pasvabPC

. save `nlsy_agestd', replace
(note: file /tmp/90665.1.all.q/St10448.000006 not found)
file /tmp/90665.1.all.q/St10448.000006 saved

. 
. *****************************************************************************
. 
. // Within sample, we map AFQT scores by age into the age=16
. // distribution. We therefore require mean AFQT-scores of age=16 by percentil
> e in each sample. 
. // We need to generate these averages using the weights.
. bys sample pasvabPC: egen pop=sum(weight) if age==16
(16145 missing values generated)

. bys sample pasvabPC: egen tot_score=sum(asvabPC*weight) if age==16      // Me
> an age=16 raw score for each percentile //
(16145 missing values generated)

. bys sample pasvabPC: gen mean=tot_score/pop if age==16
(16145 missing values generated)

. drop tot_score pop

. egen tag=tag(sample pasvabPC mean) if age==16                   // We need on
> ly 1 obs per sample and percentile //

. keep if tag==1
(18779 observations deleted)

. ren mean asvabPC_std

. keep sample asvabPC_std pasvabPC asvabPC

. sort sample pasvabPC

. save `agestd_afqt', replace
(note: file /tmp/90665.1.all.q/St10448.000007 not found)
file /tmp/90665.1.all.q/St10448.000007 saved

. 
. use `nlsy_agestd', clear

. merge sample pasvabPC using `agestd_afqt' // Merge into each percentile the a
> ge=16 corresponding score //
(note: you are using old merge syntax; see [D] merge for new syntax)
variables sample pasvabPC do not uniquely identify observations in the master
    data

. drop _merge

. keep sample pid male pasvabPC asvabPC_std asvabPC weight age

. sort sample pid

. // The final data contains "asvabPC_std": which is the comparable score acros
> s the 2 NLSY samples //
. *keep if sample==1 // this drops the NLSY79 observations //
. ren pid ID

. ren weight weight_altonji

. gen year = 1979 if sample==0
(7103 missing values generated)

. replace year = 1997 if sample==1 
(7103 real changes made)

. *drop sample age asvabPC pasvabPC // drop the other variables since I don't u
> se them in my research //
. preserve

.         use `afqt97', clear

.         keep pid male

.         ren pid ID

.         save `missings', replace
(note: file /tmp/90665.1.all.q/St10448.000008 not found)
file /tmp/90665.1.all.q/St10448.000008 saved

. restore

. 
. preserve

.         keep if sample==1
(11878 observations deleted)

.         merge 1:1 ID using `missings'

    Result                           # of obs.
    -----------------------------------------
    not matched                         1,881
        from master                         0  (_merge==1)
        from using                      1,881  (_merge==2)

    matched                             7,103  (_merge==3)
    -----------------------------------------

.         tab _merge

                 _merge |      Freq.     Percent        Cum.
------------------------+-----------------------------------
         using only (2) |      1,881       20.94       20.94
            matched (3) |      7,103       79.06      100.00
------------------------+-----------------------------------
                  Total |      8,984      100.00

.         zscore asvabPC_std
z_asvabPC_std created with 1881 missing values

.         drop asvabPC_std

.         ren z_asvabPC_std asvabPC_std

.         keep if male==1
(4385 observations deleted)

.         sort ID

.         outsheet ID asvabPC_std using altonjiAFQT.csv, comma nol replace

. restore

. save afqt_adjusted_final, replace 
file afqt_adjusted_final.dta saved

. save `finished_product', replace
(note: file /tmp/90665.1.all.q/St10448.000009 not found)
file /tmp/90665.1.all.q/St10448.000009 saved

. 
. tabstat asvabPC_std, by(sample) stats(n min max mean median)

Summary for variables: asvabPC_std
     by categories of: sample 

  sample |         N       min       max      mean       p50
---------+--------------------------------------------------
       0 |     11878        20        62  44.90718        47
       1 |      7103        20        62  44.71777        47
---------+--------------------------------------------------
   Total |     18981        20        62   44.8363        47
------------------------------------------------------------

. 
. isid ID year

. 
. * Test: the afqt distributions across age within sample should now be identic
> al. There will still be very small deviations
. * because of the coarseness of the above expansion, but we believe these to b
> e third order. To allow this code to run 
. * rapidly, we tolerate these deviations. 
. 
. **bys sample age: sum asvabPC_std [fw=weight], d
. 
. use `finished_product', clear

. ** Generate a NLSY79-only supplement:
. drop if sample==1
(7103 observations deleted)

. save afqt_adjusted_final79, replace
file afqt_adjusted_final79.dta saved

. 
. use `finished_product', clear

. ** Generate a NLSY97-only supplement:
. drop if sample==0
(11878 observations deleted)

. save afqt_adjusted_final97, replace
file afqt_adjusted_final97.dta saved

. 
. log close
      name:  <unnamed>
       log:  /afs/econ.duke.edu/data/vjh3/WageReturns/Data/y97/RawData/AFQT_MAT
> CHING_PC/AFQT_MATCHING_PC_with_weights.log
  log type:  text
 closed on:  14 Jan 2014, 13:38:35
-------------------------------------------------------------------------------
